# import seaborn to load the built-in 'penguins' dataset
import seaborn as sns
df = sns.load_dataset('penguins')
df
| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| 3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN |
| 4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 339 | Gentoo | Biscoe | NaN | NaN | NaN | NaN | NaN |
| 340 | Gentoo | Biscoe | 46.8 | 14.3 | 215.0 | 4850.0 | Female |
| 341 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
| 342 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
| 343 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
344 rows × 7 columns
# summary statistics of the dataset (numeric and categorical columns), transposed
df.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| species | 344 | 3 | Adelie | 152 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| island | 344 | 3 | Biscoe | 168 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| bill_length_mm | 342.0 | NaN | NaN | NaN | 43.92193 | 5.459584 | 32.1 | 39.225 | 44.45 | 48.5 | 59.6 |
| bill_depth_mm | 342.0 | NaN | NaN | NaN | 17.15117 | 1.974793 | 13.1 | 15.6 | 17.3 | 18.7 | 21.5 |
| flipper_length_mm | 342.0 | NaN | NaN | NaN | 200.915205 | 14.061714 | 172.0 | 190.0 | 197.0 | 213.0 | 231.0 |
| body_mass_g | 342.0 | NaN | NaN | NaN | 4201.754386 | 801.954536 | 2700.0 | 3550.0 | 4050.0 | 4750.0 | 6300.0 |
| sex | 333 | 2 | Male | 168 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# explore missing values
# NOTE(review): `eda` is not defined in this chunk — presumably `from dataprep import eda`
# (the dask warning below matches dataprep's backend); confirm against the full notebook
eda.plot_missing(df)
C:\Users\pmore\anaconda3\envs\pycaret3\lib\site-packages\dask\core.py:119: RuntimeWarning: invalid value encountered in divide return func(*(_execute_task(a, cache) for a in args))
| Missing Cells | 19 |
|---|---|
| Missing Cells (%) | 0.8% |
| Missing Columns | 5 |
| Missing Rows | 11 |
| Avg Missing Cells per Column | 2.71 |
| Avg Missing Cells per Row | 0.06 |
# drop rows with any missing value, then re-check for missing data
df.dropna(axis = 0, inplace = True)
eda.plot_missing(df)
C:\Users\pmore\anaconda3\envs\pycaret3\lib\site-packages\dask\core.py:119: RuntimeWarning: invalid value encountered in divide return func(*(_execute_task(a, cache) for a in args))
| Missing Cells | 0 |
|---|---|
| Missing Cells (%) | 0.0% |
| Missing Columns | 0 |
| Missing Rows | 0 |
| Avg Missing Cells per Column | 0.0 |
| Avg Missing Cells per Row | 0.0 |
## sklearn
# preparación de datos para entrenamiento y split
from sklearn.model_selection import train_test_split
# target: penguin species; features: everything else
y = df['species']
x = df.drop('species', axis = 1)
# split the dataset (default 75/25 train/test)
# NOTE(review): no `stratify=y`, so class proportions may differ between splits — confirm intent
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 100)
# preprocessing pipeline setup: separate columns by dtype
numeric_features = x_train.select_dtypes(exclude = object).columns
# NOTE(review): 'categorial_features' is a typo for 'categorical_features'; kept as-is
# so the module-level name used below stays stable
categorial_features = x_train.select_dtypes(include = object).columns
# build the preprocessing pipeline and its transformations
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# numeric columns: imputation with SimpleImputer's default (mean) strategy
numeric_imputer = SimpleImputer()
# categorical columns: most-frequent imputation followed by one-hot encoding
categorical_transf = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OneHotEncoder())
preprocess = ColumnTransformer(
    transformers = [
        ('num', numeric_imputer, numeric_features),
        ('cat', categorical_transf, categorial_features)
    ]
)
# fit the preprocessing pipeline on the training data and show transformed row 210
preprocess.fit_transform(x_train)[210]
array([4.740e+01, 1.460e+01, 2.120e+02, 4.725e+03, 1.000e+00, 0.000e+00,
0.000e+00, 1.000e+00, 0.000e+00])
# modelling pipeline: preprocessing followed by a Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier
pipeline = make_pipeline(preprocess, DecisionTreeClassifier())
# display the complete pipeline
pipeline
Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers=[('num', SimpleImputer(),
Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')),
('cat',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='most_frequent')),
('onehotencoder',
OneHotEncoder())]),
Index(['island', 'sex'], dtype='object'))])),
('decisiontreeclassifier', DecisionTreeClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers=[('num', SimpleImputer(),
Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')),
('cat',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='most_frequent')),
('onehotencoder',
OneHotEncoder())]),
Index(['island', 'sex'], dtype='object'))])),
('decisiontreeclassifier', DecisionTreeClassifier())])ColumnTransformer(transformers=[('num', SimpleImputer(),
Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')),
('cat',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='most_frequent')),
('onehotencoder',
OneHotEncoder())]),
Index(['island', 'sex'], dtype='object'))])Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')
SimpleImputer()
Index(['island', 'sex'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder()
DecisionTreeClassifier()
# fit the full pipeline on the training data
pipeline.fit(x_train, y_train)
Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers=[('num', SimpleImputer(),
Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')),
('cat',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='most_frequent')),
('onehotencoder',
OneHotEncoder())]),
Index(['island', 'sex'], dtype='object'))])),
('decisiontreeclassifier', DecisionTreeClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers=[('num', SimpleImputer(),
Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')),
('cat',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='most_frequent')),
('onehotencoder',
OneHotEncoder())]),
Index(['island', 'sex'], dtype='object'))])),
('decisiontreeclassifier', DecisionTreeClassifier())])ColumnTransformer(transformers=[('num', SimpleImputer(),
Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')),
('cat',
Pipeline(steps=[('simpleimputer',
SimpleImputer(strategy='most_frequent')),
('onehotencoder',
OneHotEncoder())]),
Index(['island', 'sex'], dtype='object'))])Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')
SimpleImputer()
Index(['island', 'sex'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder()
DecisionTreeClassifier()
# generate predictions on the held-out test set
y_pred = pipeline.predict(x_test)
# evaluate the model
from sklearn.metrics import confusion_matrix, f1_score
f1_score(y_test, y_pred, average = 'micro')
0.9761904761904762
# show a simple confusion matrix
confusion_matrix(y_test, y_pred)
array([[32, 0, 2],
[ 0, 17, 0],
[ 0, 0, 33]], dtype=int64)
## river
# importando las funciones de river
from river import (
    stream,
    compose,
    preprocessing,
    evaluate,
    metrics,
    tree,
    imblearn,
    stats,
)
import numbers
# iterate the static dataframe row by row to emulate a stream of incoming
# events; stream.iter_pandas yields each row as a plain dict plus its label
for xi, yi in stream.iter_pandas(x, y):
    pass
Dado que river está diseñado para operar con datos en stream -y no en batch-, debemos emular la entrada de observaciones (líneas de un dataframe estático) 1 a 1. Esto se consigue iterando un dataframe línea por línea.
Además, cada evento entrante a river (observación del dataframe) debe ser un diccionario, no un array o dataframe.
# inspect the last emitted event (a plain dict of feature name -> value)
xi
{'island': 'Biscoe',
'bill_length_mm': 49.9,
'bill_depth_mm': 16.1,
'flipper_length_mm': 213.0,
'body_mass_g': 5400.0,
'sex': 'Male'}
# cross-check against the last row of the dataframe
x.loc[343, ]
island Biscoe bill_length_mm 49.9 bill_depth_mm 16.1 flipper_length_mm 213.0 body_mass_g 5400.0 sex Male Name: 343, dtype: object
# river preprocessing for numeric and categorical features
# BUG FIX: the categorical branch used compose.SelectType(object), which matches
# *every* value (all Python values are instances of `object`), so the numeric
# features were being one-hot encoded as well — visible in the previous output
# as keys like 'bill_length_mm_49.9': 1. SelectType(str) restricts the branch
# to the true categorical (string) features.
cat = (
    compose.SelectType(str)
    | preprocessing.StatImputer()
    | preprocessing.OneHotEncoder(sparse=True)
)
num = compose.SelectType(numbers.Number) | preprocessing.StatImputer()
# combine both branches into one preprocessing step and apply it to one event
preprocess_xi = num + cat
preprocess_xi.learn_one(xi)
xi_transformed = preprocess_xi.transform_one(xi)
xi_transformed
{'island_Biscoe': 1,
'bill_length_mm_49.9': 1,
'bill_depth_mm_16.1': 1,
'flipper_length_mm_213.0': 1,
'body_mass_g_5400.0': 1,
'sex_Male': 1,
'bill_length_mm': 49.9,
'bill_depth_mm': 16.1,
'flipper_length_mm': 213.0,
'body_mass_g': 5400.0}
# instantiate the online Decision Tree (Hoeffding Tree) and show its
# summary — all counters are empty/zero because nothing has been learned yet
clf = tree.HoeffdingTreeClassifier()
clf.summary
{'n_nodes': None,
'n_branches': None,
'n_leaves': None,
'n_active_leaves': 0,
'n_inactive_leaves': 0,
'height': 0,
'total_observed_weight': 0.0}
# build the complete river pipeline via a helper function (UDF)
def get_pipeline():
    """Return a fresh online pipeline: type-based imputation + one-hot encoding + Hoeffding tree.

    BUG FIX: the categorical branch previously used compose.SelectType(object),
    which matches every feature (all values are instances of ``object``) and so
    one-hot encoded the numeric columns too. SelectType(str) limits the branch
    to the string-valued (categorical) features.
    """
    cat = (
        compose.SelectType(str)
        | preprocessing.StatImputer()
        | preprocessing.OneHotEncoder(sparse=True)
    )
    num = compose.SelectType(numbers.Number) | preprocessing.StatImputer()
    preprocess_xi = num + cat
    classifier = tree.HoeffdingTreeClassifier()
    return preprocess_xi | classifier
# Prequential (test-then-train) loop: score each incoming sample, then learn from it.
def train(x, y):
    """Stream the dataframe row by row and train the online pipeline.

    Returns the running micro-F1 history (in percent), the confusion
    matrix, and the fitted pipeline.
    """
    model = get_pipeline()
    # running metrics (local name avoids shadowing sklearn's f1_score import)
    micro_f1 = metrics.MicroF1()
    conf_mat = metrics.ConfusionMatrix()
    history = []
    # emulate a real-time stream by iterating the static dataframe
    for features, label in stream.iter_pandas(x, y, shuffle = True, seed = 100):
        # predict first — the sample is unseen at this point
        prediction = model.predict_one(features)
        if prediction is not None:
            micro_f1.update(label, prediction)
            history.append(micro_f1.get() * 100)
            conf_mat.update(label, prediction)
        # only then train on the new sample
        model.learn_one(features, label)
    return history, conf_mat, model
# run the prequential training loop over the full dataset
# NOTE: this rebinds `pipeline`, shadowing the earlier sklearn pipeline
f1_scores, cm, pipeline = train(x, y)
# display the river pipeline built for real-time classification
pipeline
Select(Number)
Select (
<class 'numbers.Number'>
)
StatImputer
(
imputers=()
)
Select(object)
Select (
<class 'object'>
)
StatImputer
(
imputers=()
)
OneHotEncoder
(
sparse=True
)
HoeffdingTreeClassifier
(
grace_period=200
max_depth=inf
split_criterion="info_gain"
delta=1e-07
tau=0.05
leaf_prediction="nba"
nb_threshold=0
nominal_attributes=None
splitter=GaussianSplitter (
n_splits=10
)
binary_split=False
max_size=100.
memory_estimate_period=1000000
stop_mem_management=False
remove_poor_attrs=False
merit_preprune=True
)
# show the summary statistics of the fitted classifier step
pipeline.steps['HoeffdingTreeClassifier'].summary
{'n_nodes': 1,
'n_branches': 0,
'n_leaves': 1,
'n_active_leaves': 1,
'n_inactive_leaves': 0,
'height': 1,
'total_observed_weight': 333.0}
# confusion matrix of the classifier built with `river`
cm
Adelie Chinstrap Gentoo Adelie 139 5 1 Chinstrap 10 58 0 Gentoo 3 0 116
# visualize the model's running score across iterations
import matplotlib.pyplot as plt


def plot(scores: list):
    """Draw a line plot of the running score versus iteration number."""
    plt.figure(figsize=(8, 6), dpi=80)
    axes = sns.lineplot(x=range(len(scores)), y=scores)
    axes.set(xlabel="num_iters", ylabel="score")
    plt.show()


plot(f1_scores)
El modelo alcanza el nivel óptimo de puntuación alrededor de las 100 iteraciones, lo que aumenta el rendimiento y minimiza el uso de recursos del sistema.